In [7]:
from IPython.display import display, HTML
from io import StringIO
import requests
import json
import pandas as pd
%matplotlib inline
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
In [1]:
# The code was removed by DSX for sharing.
Out[1]:
[Row(=1, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='UNIVERSITY OF MICHIGAN', SOC_NAME='BIOCHEMISTS AND BIOPHYSICISTS', JOB_TITLE='POSTDOCTORAL RESEARCH FELLOW', FULL_TIME_POSITION='N', PREVAILING_WAGE='36067', YEAR='2016', WORKSITE='ANN ARBOR, MICHIGAN', lon='-83.7430378', lat='42.2808256'),
Row(=2, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GOODMAN NETWORKS, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATING OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='242674', YEAR='2016', WORKSITE='PLANO, TEXAS', lon='-96.6988856', lat='33.0198431'),
Row(=3, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='PORTS AMERICA GROUP, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF PROCESS OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='193066', YEAR='2016', WORKSITE='JERSEY CITY, NEW JERSEY', lon='-74.0776417', lat='40.7281575'),
Row(=4, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GATES CORPORATION, A WHOLLY-OWNED SUBSIDIARY OF TOMKINS PLC', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='REGIONAL PRESIDEN, AMERICAS', FULL_TIME_POSITION='Y', PREVAILING_WAGE='220314', YEAR='2016', WORKSITE='DENVER, COLORADO', lon='-104.990251', lat='39.7392358'),
Row(=5, CASE_STATUS='WITHDRAWN', EMPLOYER_NAME='PEABODY INVESTMENTS CORP.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT MONGOLIA AND INDIA', FULL_TIME_POSITION='Y', PREVAILING_WAGE='157518.4', YEAR='2016', WORKSITE='ST. LOUIS, MISSOURI', lon='-90.1994042', lat='38.6270025')]
In [19]:
df_data_3.head(10)
Out[19]:
[Row(=1, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='UNIVERSITY OF MICHIGAN', SOC_NAME='BIOCHEMISTS AND BIOPHYSICISTS', JOB_TITLE='POSTDOCTORAL RESEARCH FELLOW', FULL_TIME_POSITION='N', PREVAILING_WAGE='36067', YEAR='2016', WORKSITE='ANN ARBOR, MICHIGAN', lon='-83.7430378', lat='42.2808256'),
Row(=2, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GOODMAN NETWORKS, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATING OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='242674', YEAR='2016', WORKSITE='PLANO, TEXAS', lon='-96.6988856', lat='33.0198431'),
Row(=3, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='PORTS AMERICA GROUP, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF PROCESS OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='193066', YEAR='2016', WORKSITE='JERSEY CITY, NEW JERSEY', lon='-74.0776417', lat='40.7281575'),
Row(=4, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GATES CORPORATION, A WHOLLY-OWNED SUBSIDIARY OF TOMKINS PLC', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='REGIONAL PRESIDEN, AMERICAS', FULL_TIME_POSITION='Y', PREVAILING_WAGE='220314', YEAR='2016', WORKSITE='DENVER, COLORADO', lon='-104.990251', lat='39.7392358'),
Row(=5, CASE_STATUS='WITHDRAWN', EMPLOYER_NAME='PEABODY INVESTMENTS CORP.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT MONGOLIA AND INDIA', FULL_TIME_POSITION='Y', PREVAILING_WAGE='157518.4', YEAR='2016', WORKSITE='ST. LOUIS, MISSOURI', lon='-90.1994042', lat='38.6270025'),
Row(=6, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='BURGER KING CORPORATION', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='EXECUTIVE V P, GLOBAL DEVELOPMENT AND PRESIDENT, LATIN AMERI', FULL_TIME_POSITION='Y', PREVAILING_WAGE='225000', YEAR='2016', WORKSITE='MIAMI, FLORIDA', lon='-80.1917902', lat='25.7616798'),
Row(=7, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='BT AND MK ENERGY AND COMMODITIES', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATING OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='91021', YEAR='2016', WORKSITE='HOUSTON, TEXAS', lon='-95.3698028', lat='29.7604267'),
Row(=8, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='GLOBO MOBILE TECHNOLOGIES, INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='CHIEF OPERATIONS OFFICER', FULL_TIME_POSITION='Y', PREVAILING_WAGE='150000', YEAR='2016', WORKSITE='SAN JOSE, CALIFORNIA', lon='-121.8863286', lat='37.3382082'),
Row(=9, CASE_STATUS='CERTIFIED-WITHDRAWN', EMPLOYER_NAME='ESI COMPANIES INC.', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT', FULL_TIME_POSITION='Y', PREVAILING_WAGE='127546', YEAR='2016', WORKSITE='MEMPHIS, TEXAS', lon='NA', lat='NA'),
Row(=10, CASE_STATUS='WITHDRAWN', EMPLOYER_NAME='LESSARD INTERNATIONAL LLC', SOC_NAME='CHIEF EXECUTIVES', JOB_TITLE='PRESIDENT', FULL_TIME_POSITION='Y', PREVAILING_WAGE='154648', YEAR='2016', WORKSITE='VIENNA, VIRGINIA', lon='-77.2652604', lat='38.9012225')]
In [20]:
(df_data_3[['CASE_STATUS', 'SOC_NAME']]).show()
#use this to make a chart with two indexes
+-------------------+--------------------+
| CASE_STATUS| SOC_NAME|
+-------------------+--------------------+
|CERTIFIED-WITHDRAWN|BIOCHEMISTS AND B...|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
| WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
| WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
|CERTIFIED-WITHDRAWN| FINANCIAL MANAGERS|
|CERTIFIED-WITHDRAWN| CHIEF EXECUTIVES|
| CERTIFIED| CHIEF EXECUTIVES|
| CERTIFIED| CHIEF EXECUTIVES|
+-------------------+--------------------+
only showing top 20 rows
In [24]:
df_data_3.select('SOC_NAME').count()
Out[24]:
3002458
In [5]:
df_data_2['SOC_NAME'].value_counts().count()
Out[5]:
2132
In [6]:
testdata=df_data_2.copy()
In [7]:
testdata['SOC_NAME']=df_data_2['SOC_NAME'].str.lower()
testdata['EMPLOYER_NAME']=df_data_2['EMPLOYER_NAME'].str.lower()
testdata['JOB_TITLE']=df_data_2['JOB_TITLE'].str.lower()
Here, we transform some strings to lowercase. This is because there are duplicate entries in the dataset which in both upper and lower. This increases redundancy
In [8]:
testdata['SOC_NAME'].value_counts()
Out[8]:
computer systems analysts 506523
software developers, applications 414716
computer programmers 398546
computer occupations, all other 177870
software developers, systems software 83856
management analysts 66873
financial analysts 53540
accountants and auditors 52822
mechanical engineers 44236
network and computer systems administrators 40087
database administrators 38982
market research analysts and marketing specialists 37737
electronics engineers, except computer 36574
operations research analysts 34260
electrical engineers 34108
physicians and surgeons, all other 33526
computer software engineers, applications 33387
computer and information systems managers 27536
computer occupations, all other* 26254
medical scientists, except epidemiologists 26159
physical therapists 21994
biochemists and biophysicists 21245
industrial engineers 19370
computer systems analyst 17426
statisticians 17101
biological scientists, all other 16367
marketing managers 16310
civil engineers 15970
web developers 15000
internists, general 13367
...
helpers--electricians 1
industrial designers 1
logistics manager 1
electronics engineers, expert computer 1
hardward engineers 1
soil and plant scientist 1
mechanical engineers, r&d (acwia only) 1
timing device assemblers and adjusters 1
telecommunications line installers and repairers 1
biological science teachers 1
dentist 1
computer systems engineers/arquitects 1
secondary school teachers, 1
computer systems anaysts 1
counseling psychologist 1
software engineer, senior 1
health diagnosing & treating practitioners, all ot 1
set and exhibit engineers 1
helpers, construction trades, all other 1
datebase administrators 1
commercial and insdistrial designers 1
software developers, appllications 1
computer occupations,all other 1
foreign language and literature teachers, postsecondary 1
dentists 1
railroad conductors and yardmasters 1
computer occupations, all other: information techn 1
computer network architect 1
criminal justice and law enforcement teachers, postsecondary 1
medical scientists, except epidemiologist 1
Name: SOC_NAME, dtype: int64
There is still alot of redundancy we can exploit. We can generalize these strings remove specalized strings into more general form. For example 'software engineer, senior' needs to be reduced to 'software engineer'. This also applies to the other columns with string attributes.
In [9]:
testdata['SOC_NAME'].value_counts().count()
Out[9]:
1585
In [10]:
df = pd.DataFrame({'SOC_NAME': testdata['SOC_NAME'].value_counts().index, 'Count':testdata['SOC_NAME'].value_counts().values})
#df['Counts'] = df.groupby(['SOC_NAME'])['Count'].transform('count') #I don't remember what I was trying to do here.
df = df.set_index(['SOC_NAME'])
df
Out[10]:
Count
SOC_NAME
computer systems analysts
506523
software developers, applications
414716
computer programmers
398546
computer occupations, all other
177870
software developers, systems software
83856
management analysts
66873
financial analysts
53540
accountants and auditors
52822
mechanical engineers
44236
network and computer systems administrators
40087
database administrators
38982
market research analysts and marketing specialists
37737
electronics engineers, except computer
36574
operations research analysts
34260
electrical engineers
34108
physicians and surgeons, all other
33526
computer software engineers, applications
33387
computer and information systems managers
27536
computer occupations, all other*
26254
medical scientists, except epidemiologists
26159
physical therapists
21994
biochemists and biophysicists
21245
industrial engineers
19370
computer systems analyst
17426
statisticians
17101
biological scientists, all other
16367
marketing managers
16310
civil engineers
15970
web developers
15000
internists, general
13367
...
...
helpers--electricians
1
industrial designers
1
logistics manager
1
electronics engineers, expert computer
1
hardward engineers
1
soil and plant scientist
1
mechanical engineers, r&d (acwia only)
1
timing device assemblers and adjusters
1
telecommunications line installers and repairers
1
biological science teachers
1
dentist
1
computer systems engineers/arquitects
1
secondary school teachers,
1
computer systems anaysts
1
counseling psychologist
1
software engineer, senior
1
health diagnosing & treating practitioners, all ot
1
set and exhibit engineers
1
helpers, construction trades, all other
1
datebase administrators
1
commercial and insdistrial designers
1
software developers, appllications
1
computer occupations,all other
1
foreign language and literature teachers, postsecondary
1
dentists
1
railroad conductors and yardmasters
1
computer occupations, all other: information techn
1
computer network architect
1
criminal justice and law enforcement teachers, postsecondary
1
medical scientists, except epidemiologist
1
1585 rows × 1 columns
In [11]:
df['New_Name'] = ''
In [ ]:
df
Out[ ]:
Count
New_Name
SOC_NAME
computer systems analysts
506523
software developers, applications
414716
computer programmers
398546
computer occupations, all other
177870
software developers, systems software
83856
management analysts
66873
financial analysts
53540
accountants and auditors
52822
mechanical engineers
44236
network and computer systems administrators
40087
database administrators
38982
market research analysts and marketing specialists
37737
electronics engineers, except computer
36574
operations research analysts
34260
electrical engineers
34108
physicians and surgeons, all other
33526
computer software engineers, applications
33387
computer and information systems managers
27536
computer occupations, all other*
26254
medical scientists, except epidemiologists
26159
physical therapists
21994
biochemists and biophysicists
21245
industrial engineers
19370
computer systems analyst
17426
statisticians
17101
biological scientists, all other
16367
marketing managers
16310
civil engineers
15970
web developers
15000
internists, general
13367
...
...
...
helpers--electricians
1
industrial designers
1
logistics manager
1
electronics engineers, expert computer
1
hardward engineers
1
soil and plant scientist
1
mechanical engineers, r&d (acwia only)
1
timing device assemblers and adjusters
1
telecommunications line installers and repairers
1
biological science teachers
1
dentist
1
computer systems engineers/arquitects
1
secondary school teachers,
1
computer systems anaysts
1
counseling psychologist
1
software engineer, senior
1
health diagnosing & treating practitioners, all ot
1
set and exhibit engineers
1
helpers, construction trades, all other
1
datebase administrators
1
commercial and insdistrial designers
1
software developers, appllications
1
computer occupations,all other
1
foreign language and literature teachers, postsecondary
1
dentists
1
railroad conductors and yardmasters
1
computer occupations, all other: information techn
1
computer network architect
1
criminal justice and law enforcement teachers, postsecondary
1
medical scientists, except epidemiologist
1
1585 rows × 2 columns
In [ ]:
%timeit
for index, row in testdata.iterrows():
if(str(row['SOC_NAME']).endswith('*')):
row['SOC_NAME']=str(row['SOC_NAME'])[:-1]
if not (str(row['SOC_NAME']).endswith('s')):
row['SOC_NAME']=str(row['SOC_NAME'])+'s'
#if row=='software developers, appllications': #in progress
#print(row)
In [ ]:
(testdata.loc[(testdata['SOC_NAME']=='software developers, appllications')]) #an example of a query
This person messed up the SOC_NAME
Content source: JonathonBeauregardII/Advanced-Data-Mining-Project-IBM
Similar notebooks: